1 前言

1.1 主要链接

1.3 新增

  • 4 异常值
  • 3.1 武神数据(EDA)
  • 文件展示
knitr::opts_chunk$set(warning = FALSE, message = FALSE, cache = T)
library(tidyverse)
library(knitr)
library(formattable)
library(skimr)
library(DT)
library(readxl)
library(xgboost)
library(SmartEDA)
library(DT)
library(tidyquant)
library(pryr)
get_path01 <- function(x){
  file.path(getwd(),"Question01",x)}
get_path02 <- function(x){
  file.path(getwd(),"Question01","New_FDDC_financial_data_20180524",x)}
get_path03 <- function(x){
  file.path(getwd(),"Question01","New_FDDC_financial_data_20180524",
            "financial data",x)}
get_path04 <- function(x){
  file.path(getwd(),"ana_data",x)}
theme_ilo <- function(){
    theme_minimal() + 
    theme(
    # text = element_text(family = "Bookman", color = "gray25"),
    plot.subtitle = element_text(size = 9),
    plot.caption = element_text(color = "gray30"),
    # plot.background = element_rect(fill = "gray95"),
    plot.margin = unit(c(5, 10, 5, 10), units = "mm"),
    axis.title.x = element_text(size=12,face = "bold"),
    axis.title.y = element_text(size=12,face = "bold"),
    # x和y的命名,要加粗,ppt才好看
    axis.text.x = element_text(size=7, angle = 70, hjust = 1),
    # 控制axis字体大小,7号大小最好
    axis.text.y = element_text(size=7),
    legend.title=element_blank()
    )
}

2 文件展示

FDDC_financial_submit_20180524 <- 
  read_csv(get_path01("FDDC_financial_submit_20180524.csv"), 
    col_names = FALSE)
FDDC_financial_submit_20180524
## # A tibble: 1,493 x 1
##    X1         
##    <chr>      
##  1 000001.XSHE
##  2 000005.XSHE
##  3 000006.XSHE
##  4 000009.XSHE
##  5 000010.XSHE
##  6 000011.XSHE
##  7 000012.XSHE
##  8 000014.XSHE
##  9 000017.XSHE
## 10 000018.XSHE
## # ... with 1,483 more rows

预测1493个股票的营收。

read_lines(get_path03("Balance Sheet.xls"),n_max = 5)
## [1] "PK\003\004\024"                                                                                                                                                                                                                                                                                                                                             
## [2] "\177\u07c9YTU,B \x95K\xacĞ\xb7\x8c\xed\x97NoQ\x95\xc9\034\002\032g3\xd1J\x9b\"\001\x9b;m\xec$\023_\xc3\xf7ƓH\x90\x94ժt\0262\xb1\004\024\xbd\xee\xedMg\xb8\xf4\x80\tW[\xccDA䟥ļ\x80Ja\xea<X\x9e\031\xbbP)\xe2\xd70\x91^\xe5S5\001\xd9n6\037e\xee,\x81\xa5\006\xd5\030\xa2\xdby\x85\xb1\x9a\x95\x94\xbc-\xf8\xf3J\xc9\xc8X\x91\xbc\xac\xd6\xd5T\x99Pޗ&W\xc4B\xe5\xdc\xea?$"
## [3] "7\036\x9b\034\xb4\xcbg\025C\xa7\xe8\003(\x8d\005"                                                                                                                                                                                                                                                                                                             
## [4] "I_\005\xfaP\025{\x97\x8bR~\xbb0\03597M\017\x83\x9cښآ\xb4R\xc6nt\037\xe0\x8f\x8bQơua!\xb5\xbf\b|\xa2\x8e\xf6\x95踿\022\035\017\xff\xa4\x83\xf8\xcf\xf3\x8fF\x849r\020\x90\x96%९C\004=\xc6\\\xa8"                                                                                                                                                                                    
## [5] "z\a\xb7\x8c\xff\x96\xee\017"

存在编码问题。

read_excel(
  get_path02("Company Operation.xls"),
  # n_max = 100,
  skip = 1,
  sheet = "CN"
) %>% 
  rename(col = `指标`) %>% 
  distinct(`col`) %>% 
  filter(col %in% str_subset(col,"机场|航空")) %>% 
  mutate(col = str_extract(col,"^\\p{Han}{2,}")) %>% 
  distinct(col)
## # A tibble: 6 x 1
##   col     
##   <chr>   
## 1 白云机场
## 2 东方航空
## 3 海南航空
## 4 南方航空
## 5 上海机场
## 6 深圳机场

六个目标

3 EDA

3.1 武神数据

  • 数据分别从balance sheet,cash flow statement,income statement提取机场和航空公司一共6家公司的数据,然后合并得来。
  • 合并时按照TICKER_SYMBOL_股票代码,END_DATE_截止日期进行合并的。
finacial_data_airport_airline <- read_csv(get_path04("20180613_wu_finacial_data_airport_airline.csv"))
# read_lines(get_path04("20180613_wu_finacial_data_airport_airline.csv"),n_max = 2)

这里的导出注意utf-8格式1

finacial_data_airport_airline_edit <- 
finacial_data_airport_airline %>% 
    select(-X1) %>%
    rename_all(
      funs(
        str_remove_all(., "\\p{Han}{1,}") %>% 
        str_remove_all(., "_$") %>% 
        str_to_lower()
      )
  )

这里使用REVENUE_营业收入作为y变量2

3.1.1 Overview

EDA 主要参考 SmartEDA 包 测评Tidyverse使用技巧

ExpData(data=finacial_data_airport_airline_edit,type=1) %>% datatable()
ExpData(data=finacial_data_airport_airline_edit,type=2) %>% datatable()

3.1.2 连续变量

3.1.2.1 table

ExpNumStat(finacial_data_airport_airline_edit,by="A",gp="revenue",Qnt=seq(0,1,0.1),MesofShape=2,Outlier=TRUE,round=2) %>% 
    mutate_at(vars(Per_of_Missing),percent) %>% datatable()
  • Vname – Variable name - 变量名称
  • Group – Target variable -
    • TN – Total sample (inculded NA observations) - 样本总数
  • nNeg – Total negative observations - 负样本数量
  • nZero – Total zero observations - 零值数量
  • nPos – Total positive observations - 正样本数量
  • NegInf – Negative infinite count - 负无穷大极值
  • PosInf – Positive infinite count - 正无穷大极值
  • NA_value – Not Applicable count - 缺失值
  • Per_of_Missing – Percentage of missings - 缺失率
  • Min – minimum value - 最小值
  • Max – maximum value - 最大值
  • Mean – average value - 平均值
  • Median – median value - 中位数
  • SD – Standard deviation - 总体标准差
  • CV – coefficient of variations (SD/mean)*100 - z score
  • IQR – Inter quartile range - 四分位距 \(QD = \frac{Q3-Q1}{2}\)
    • Qnt – Specified quantiles - 百分位点
  • MesofShape – Skewness and Kurtosis - 偏度和峰度
  • Outlier – Number of outliers - 异常值数量
  • Cor – Correlation b/w target and independent variables - 自变量和因变量相关性

3.1.2.2 plot

ExpNumViz(finacial_data_airport_airline_edit,gp=NULL,nlim=10,Page=c(2,2),sample=NULL)
## $`0`

  • nlim: 少于10个样本不画图
  • sample: 随机选择变量进行展示,NULL表示全部展示
ExpNumViz(finacial_data_airport_airline_edit,gp="t_revenue",nlim=4,fname=NULL,col=NULL,Page=c(2,2))
## $`0`

3.1.3 分类变量

3.1.3.1 table

ExpCTable(finacial_data_airport_airline_edit,Target="t_revenue",margin=1,clim=10,nlim=NULL,round=2,bin=4,per=F) %>% datatable()

3.1.3.2 plot

ExpCatViz(finacial_data_airport_airline_edit,gp=NULL,fname=NULL,clim=10,col=NULL,margin=2,Page = c(2,1))
## $`0`

4 异常值

# library(readr)
# finacial_all <- read_csv("ana_data/20180619_wu_finacial_all.csv", 
#     col_names = FALSE)
finacial_all <- read_excel("ana_data/20180619_wu_finacial_all.xlsx")
object_size(finacial_all)
## 192 MB
remove_outliers <- function(x, na.rm = TRUE, ...) {
  qnt <- quantile(x, probs=c(.25, .75), na.rm = na.rm, ...)
  H <- 1.5 * IQR(x, na.rm = na.rm)
  y <- x
  y[x < (qnt[1] - H)] <- NA
  y[x > (qnt[2] + H)] <- NA
  y
}
finacial_all_edited <- 
finacial_all %>% 
    rename_all(
      funs(
        str_remove_all(., "\\p{Han}{1,}") %>% 
        str_remove_all(., "_$") %>% 
        str_to_lower()
      )
  ) %>% 
    select(ticker_symbol,end_date,t_revenue)
finacial_all_edited %>% 
    group_by(ticker_symbol) %>% 
    summarise(n = n()) %>% 
    arrange(desc(n)) %>% datatable()
finacial_all_edited %>% 
    group_by(ticker_symbol) %>% 
    filter(
        t_revenue < quantile(t_revenue,probs = 0.25,na.rm=T) - 3 * IQR(t_revenue,na.rm=T) |
        t_revenue > quantile(t_revenue,probs = 0.75,na.rm=T) + 3 * IQR(t_revenue,na.rm=T)
    ) %>% 
    write_csv(get_path04("20180619_li_outiler_list.csv")) %>% 
    group_by(ticker_symbol) %>% 
    summarise(n = n()) %>% 
    arrange(desc(n)) %>% datatable()

这里使用的是3倍的IQR的条件,比箱形图要求严格。


  1. 否则会出现报错。

    Error in make.names(x) : invalid multibyte string 1

  2. finacial_data_airport_airline %>% names() %>% 
        str_subset("收入")
    ## [1] "T_REVENUE_营业总收入"           "REVENUE_营业收入"              
    ## [3] "INT_INCOME_利息收入"            "COMMIS_INCOME_手续费及佣金收入"
    ## [5] "SPEC_OR_营业总收入的调整项目"   "AOR_营业总收入的差错金额"      
    ## [7] "NOPERATE_INCOME_营业外收入"